/***
This do-file produces the ZCTA-level Median 2-Bedroom Rent data we use from the
zipped CSV downloaded from Census (ACS 2014-2018 5-year estimates, table B25031).
***/

*-------------------------------------------------------------------------------
* Set up
*-------------------------------------------------------------------------------

* Set $root; when r(buildrunning) is 0, also pull in the interactive configuration
project figstabs, root
if (r(buildrunning)==0) include "${root}/code/config_interactive.do"

* Make sure the output folders exist (parent first); errors from mkdir,
* e.g. because a folder already exists, are deliberately ignored
foreach acs_folder in ///
	"${root}/data/derived/ACS 2014-2018 5-Year ZCTA" ///
	"${root}/data/derived/ACS 2014-2018 5-Year ZCTA/Individual Variables" {
	capture mkdir "`acs_folder'"
}

*-------------------------------------------------------------------------------
* Load and clean raw data
*-------------------------------------------------------------------------------

* Work inside the output folder: unzipfile extracts the archive into the
* current working directory
cd "${root}/data/derived/ACS 2014-2018 5-Year ZCTA/Individual Variables"

* Declare the raw zip archive to -project- as an input, then extract it
project, uses("${root}/data/dvc/ACS 2014-2018 5-Year ZCTA/Median 2BR Rent/ACSDT5Y2018.B25031_2020-08-21T102104.zip") raw
unzipfile "${root}/data/dvc/ACS 2014-2018 5-Year ZCTA/Median 2BR Rent/ACSDT5Y2018.B25031_2020-08-21T102104.zip", replace

* Declare the extracted CSV to -project- as an input, then import it using
* the first row as variable names
project, uses("${root}/data/derived/ACS 2014-2018 5-Year ZCTA/Individual Variables/ACSDT5Y2018.B25031_data_with_overlays_2020-08-21T102029.csv")
import delimited "${root}/data/derived/ACS 2014-2018 5-Year ZCTA/Individual Variables/ACSDT5Y2018.B25031_data_with_overlays_2020-08-21T102029.csv", varnames(1) clear

* Remove sections of these variables other than the 5-digit ZCTA
replace geo_id = subinstr(geo_id, "8600000US", "", .)
replace name = subinstr(name, "ZCTA5 ", "", .)

* Drop row containing descriptions of variables
drop in 1

* Variables are now identical so one can be dropped and the remaining one renamed to "zcta5"
assert geo_id == name
drop geo_id
rename name zcta5
assert strlen(zcta5) == 5

* Keeping only the ZCTA variable and the median 2BR rent variable
rename b25031_004e med_2br_2014_2018_est
keep zcta5 med_2br_2014_2018_est

* Replace the non-numeric missing marker ("-") with an empty string, and the
* non-numeric bottom-/top-coded values with the nearest integer just outside
* the published range ("100-" becomes 99, "3,500+" becomes 3501)
replace med_2br_2014_2018_est = "" if med_2br_2014_2018_est == "-"
replace med_2br_2014_2018_est = "99" if med_2br_2014_2018_est == "100-"
replace med_2br_2014_2018_est = "3501" if med_2br_2014_2018_est == "3,500+"

* Convert all variables to numeric
* (note: the numeric zcta5 no longer carries leading zeros, e.g. "00601" becomes 601)
destring zcta5, replace
destring med_2br_2014_2018_est, replace

* destring does not raise an error when non-numeric characters remain; it
* leaves the variable as a string. Stop here if any value other than the
* three handled above slipped through, rather than saving a string variable.
confirm numeric variable zcta5 med_2br_2014_2018_est

*-------------------------------------------------------------------------------
* Save file
*-------------------------------------------------------------------------------

* Write the cleaned dataset to disk and declare it to -project- as a file
* created by this do-file
local rent_dta "${root}/data/derived/ACS 2014-2018 5-Year ZCTA/Individual Variables/ACS 2014-2018 Median 2BR Rent.dta"
save "`rent_dta'", replace
project, creates("`rent_dta'")
